In [1]:
%pylab
%matplotlib inline
In [2]:
cd ..
In [3]:
import sys
import numpy as np
import skimage
import cv2
import sklearn
import sklearn.cross_validation
import sklearn.preprocessing
import sklearn.metrics
import sklearn.feature_selection
import sklearn.linear_model
import sklearn.svm
import imp
In [4]:
import holoviews
In [5]:
import neukrill_net.utils
import neukrill_net.highlevelfeatures
import neukrill_net.stacked
In [6]:
import time
In [54]:
#%pdb
In [7]:
settings = neukrill_net.utils.Settings('settings.json')
In [8]:
X,y = settings.flattened_train_paths(settings.classes)
In [9]:
pkl_names = ['pftas.pkl','contourhistogram.pkl','contourmoments.pkl','haralick.pkl']
In [10]:
t0 = time.time()
hlf = []      # feature extractor objects
XF_list = []  # their cached feature arrays
import sklearn.externals.joblib  # not pulled in by `import sklearn` alone
for pkl_name in pkl_names:
    # each cached pickle holds a (feature extractor, feature array) pair
    tmp = sklearn.externals.joblib.load('cache/' + pkl_name)
    hlf += [tmp[0]]
    XF_list += [tmp[1]]
print("Loading features took {}".format(time.time() - t0))
In [11]:
XF = np.concatenate(XF_list, 2)  # join the feature blocks along the last (feature) axis
In [12]:
XF.shape
Out[12]:
In [13]:
XF[0,0,:]
Out[13]:
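Each cached block appears to have shape (1, n_images, n_features), which is why the classifiers below work on XF.squeeze(0). A toy check of the axis-2 concatenation (with made-up widths):
In [ ]:
a = np.zeros((1, 10, 4))  # a 4-wide feature block for 10 images
b = np.zeros((1, 10, 6))  # a 6-wide block for the same images
np.concatenate([a, b], 2).shape  # -> (1, 10, 10)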
In [14]:
import sklearn.naive_bayes
In [15]:
clf = sklearn.naive_bayes.GaussianNB()
In [16]:
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
In [17]:
X_new = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=45).fit_transform(XF.squeeze(0), y)
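To see what the univariate filter is keeping, the selector can be fitted separately and its scores inspected (a sketch; `skb` is a name introduced here):
In [ ]:
skb = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=45)
skb.fit(XF.squeeze(0), y)
print(skb.get_support().nonzero()[0])   # column indices of the 45 kept features
print(skb.scores_[skb.get_support()])   # their ANOVA F-scores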
In [18]:
my_X = X_new
clf = sklearn.naive_bayes.GaussianNB()
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
On the original feature set:
In [19]:
import sklearn.ensemble
In [21]:
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
This is similar to the result from just the Contour Moments and Haralick features.
On the reduced feature set:
In [22]:
my_X = X_new
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
It does slightly worse with fewer features. Maybe k=45 was too few?
In [23]:
my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
In [24]:
import neukrill_net.taxonomy
In [29]:
reload(neukrill_net.stacked)
Out[29]:
In [36]:
reload(neukrill_net.taxonomy)
Out[36]:
In [37]:
neukrill_net.taxonomy.taxonomy
Out[37]:
In [38]:
settings.classes
Out[38]:
In [39]:
marked_taxonomy = neukrill_net.stacked.propagate_labels_to_leaves(neukrill_net.taxonomy.taxonomy, settings.classes)
In [40]:
marked_taxonomy
Out[40]:
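For reference, a hierarchical classifier of this kind typically fits one base classifier per internal node of the taxonomy and chains conditional probabilities down to the leaves. A minimal two-level sketch of that idea (illustrative only; `leaf_to_super` is a hypothetical mapping, and the real HierarchyClassifier may differ):
In [ ]:
def two_level_proba(X_tr, y_tr, X_te, leaf_to_super, classes):
    # Assumes every superclass contains at least two leaf classes.
    y_tr = np.asarray(y_tr)
    supers = np.array([leaf_to_super[c] for c in y_tr])
    # Top level: P(superclass | x).
    top = sklearn.linear_model.LogisticRegression(random_state=42).fit(X_tr, supers)
    p_top = top.predict_proba(X_te)
    p = np.zeros((X_te.shape[0], len(classes)))
    for j, s in enumerate(top.classes_):
        # Second level: P(leaf | superclass, x), one classifier per node.
        mask = supers == s
        sub = sklearn.linear_model.LogisticRegression(random_state=42).fit(X_tr[mask], y_tr[mask])
        p_sub = sub.predict_proba(X_te)
        for k, leaf in enumerate(sub.classes_):
            # Chain rule down the tree: P(leaf) = P(super) * P(leaf | super).
            p[:, list(classes).index(leaf)] = p_top[:, j] * p_sub[:, k]
    return p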
In [41]:
base_clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_clf)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
In [42]:
base_clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_clf)
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
Try with a pipeline to reduce the number of features at each level.
In [44]:
import sklearn.pipeline
In [47]:
base_clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)
best_filter = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100)
base_pipe = sklearn.pipeline.Pipeline([('filter', best_filter), ('clf', base_clf)])
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_pipe)
my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
In [48]:
clf = sklearn.linear_model.LogisticRegression(random_state=42)
In [49]:
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
In [50]:
clf = sklearn.linear_model.LogisticRegression(random_state=42)
my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
In [55]:
base_clf = sklearn.linear_model.LogisticRegression(random_state=42)
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_clf)
my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
In [52]:
base_clf = sklearn.linear_model.LogisticRegression(random_state=42)
best_filter = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100)
base_pipe = sklearn.pipeline.Pipeline([('filter', best_filter), ('clf', base_clf)])
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_pipe)
my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
In [56]:
clf = sklearn.svm.SVC(kernel='linear', probability=True, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
In [57]:
clf = sklearn.svm.SVC(kernel='linear', probability=True, random_state=42)
my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
In [58]:
base_clf = sklearn.svm.SVC(kernel='linear', probability=True, random_state=42)
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_clf)
my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
In [59]:
base_clf = sklearn.svm.SVC(kernel='linear', probability=True, random_state=42)
best_filter = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100)
base_pipe = sklearn.pipeline.Pipeline([('filter', best_filter), ('clf', base_clf)])
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_pipe)
my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
One-vs-one: SVC handles multiclass problems by training one binary classifier for every pair of classes.
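With this many classes the pairwise count is large (a quick check):
In [ ]:
n = len(settings.classes)  # number of leaf classes
print("{} classes -> {} pairwise classifiers".format(n, n * (n - 1) // 2))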
In [60]:
clf = sklearn.svm.SVC(probability=True, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
In [61]:
clf = sklearn.svm.SVC(kernel='rbf', probability=True, random_state=42)
my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
In [62]:
base_clf = sklearn.svm.SVC(kernel='rbf', probability=True, random_state=42)
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_clf)
my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
In [63]:
base_clf = sklearn.svm.SVC(kernel='rbf', probability=True, random_state=42)
best_filter = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100)
base_pipe = sklearn.pipeline.Pipeline([('filter', best_filter), ('clf', base_clf)])
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_pipe)
my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
In [ ]: